!! Copyright (C) 2010,2011,2012  Luca Bonaventura, MOX, Polimi
!!
!! This file is part of:
!!   FEMilaro -- Finite Element Method toolkit
!!
!! FEMilaro is free software; you can redistribute it and/or modify it
!! under the terms of the GNU General Public License as published by
!! the Free Software Foundation; either version 3 of the License, or
!! (at your option) any later version.
!!
!! FEMilaro is distributed in the hope that it will be useful, but
!! WITHOUT ANY WARRANTY; without even the implied warranty of
!! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
!! General Public License for more details.
!!
!! You should have received a copy of the GNU General Public License
!! along with FEMilaro; If not, see <http://www.gnu.org/licenses/>.
!!
!! author: Luca Bonaventura               <luca.bonaventura@polimi.it>

!>\brief
!!
!! Collection of exponential time integrators and auxiliary routines
!!
!! \n
!!
!! Exponential Rosenbrock methods have been introduced in <a
!! href="http://dx.doi.org/10.1137/S0036142995280572">[Hochbruck,
!! Lubich, SIAM J. Numer. Anal., 1997]</a> and <a
!! href="http://dx.doi.org/10.1137/S1064827595295337">[Hochbruck,
!! Lubich, Selhofer, SIAM J. Sci. Comp., 1998]</a>. They achieve
!! unconditional linear stability and high order by an approach that
!! is similar to that of the Rosenbrock-Wanner methods (see e.g. <a
!! href="http://dx.doi.org/10.1007/978-3-642-05221-7">[Hairer Wanner,
!! <em>Solving Ordinary Differential Equations II, Stiff and
!! Differential-Algebraic Problems</em>, Springer 1996 (chapter
!! IV.7)]</a> and <a
!! href="http://dx.doi.org/10.1007/BF01932707">[Hairer, Lubich, Roche,
!! BIT Numerical Mathematics, 1989]</a>), whence their name.
!! Considering a generic nonlinear ODE system
!! \f{displaymath}{
!!  \begin{array}{rcll}
!!   \displaystyle \frac{d {\bf u} }{d t} & = & {\bf f} ( {\bf u}), &
!!   t \in (0,T] \\[2mm]
!!   {\bf u}(0) & = & {\bf u}_0
!!  \end{array}
!! \f}
!! and introducing for simplicity equally spaced time levels
!! \f$t^n=n\Delta t, \ n=0,\dots,N\f$, with \f$\Delta t=T/N\f$ the
!! second order exponential Rosenbrock method can be derived rewriting
!! at each time step the system to be solved as:
!! \f{displaymath}{
!!  \begin{array}{rcl}
!!   \displaystyle \frac{d {\bf u} }{d t} & = & \displaystyle
!!   \frac{\partial \bf f}{\partial \bf u} ( {\bf u}^n)({\bf u} -
!!   {\bf u}^n ) + {\bf R}({\bf u},{\bf u}^n) \\[2mm]
!!   {\bf u}(0) & = & {\bf u}^n,
!!  \end{array}
!! \f}
!! where \f${\bf u}^n\f$ is the previously computed numerical
!! solution. Dropping higher order terms and solving by the
!! exponential representation formula the resulting linear system, one
!! obtains
!! \f{displaymath}{
!!  \begin{array}{rcl}
!!   {\bf u}^{n+1} & = & {\bf u}^{n} + \Delta t\, \varphi(\Delta t \,
!!   A) \,{\bf f}({\bf u}^n) \\[2mm]
!!   A & = & \displaystyle \frac{\partial \bf f}{\partial \bf
!!   u}({\bf u}^n)
!!  \end{array}
!! \f}
!! where
!! \f{displaymath}{
!!   \varphi(z) = \frac{e^z-1}{z}.
!! \f}
!! This method is second order accurate and is easily seen to yield
!! the exact solution (and therefore to be unconditionally A-stable)
!! in the linear case.
!!
!! The main difficulty for the practical use of such methods concerns
!! the evaluation of the matrix function \f$\varphi(\Delta t \, A)\f$
!! (or, more precisely, the action of this matrix on a given vector
!! \f${\bf w}\f$) when \f$A\f$ is a large, sparse matrix. To this end,
!! we consider here the following alternatives:
!! <ul>
!!  <li> Krylov space methods to reduce the problem to the computation
!!  of \f$\varphi(B)\f$, where \f$B\f$ is a small, dense matrix,
!!  which is then computed using a Pad&eacute; expansion
!!  <li> polynomial interpolation of the function \f$\varphi(z)\f$ on
!!  Leja points and evaluation of the resulting polynomial directly on
!!  the sparse matrix \f$\Delta t\,A\f$.
!! </ul>
!! Regardless of the chosen method, the main computational tool is
!! represented by the computation of sparse matrix-vector products
!! involving the Jacobian \f$A\f$ of the ODE system and a generic
!! vector \f${\bf v}\f$, for which we use the Jacobian-free
!! approximation
!! \f{displaymath}{
!!  A{\bf v} = \left( \frac{\partial \bf f}{\partial {\bf u}}(t^n,{\bf
!!  u}^n)\right){\bf v} \approx \frac{1}{\epsilon}\Big(
!!  {\bf f}(t^n,{\bf u}^n+\epsilon{\bf v}) -
!!  {\bf f}(t^n,{\bf u}^n) \Big).
!! \f}
!! This operation is provided by the module subroutine \c directional.
!! The computational cost is directly proportional to the number of
!! matrix-vector products which are computed, so that an efficient
!! method must yield an accurate approximation of \f$\varphi(\Delta
!! t\,A)\f$ using few sparse matrix-vector products. Also, we notice
!! that the computational cost per time-step of an exponential
!! Rosenbrock method using \f$m\f$ matrix-vector products is
!! comparable with the one of an \f$m\f$-stage Runge-Kutta method.
!!
!! \section stoppingcriteria Error control
!!
!! The error introduced by the time discretization has two sources:
!! <ul>
!!  <li> the fact that the ODE is linearized and the Jacobian matrix
!!  is kept constant during each time-step - this error vanishes for
!!  linear problems and can be reduced using higher order exponential
!!  Rosenbrock methods
!!  <li> the fact that the exponential matrix can not be computed
!!  exactly, but rather up to a tolerance \f$\epsilon_{\exp}\f$ - the
!!  exact meaning of \f$\epsilon_{\exp}\f$ depends on the chosen
!!  method to approximate the matrix exponential.
!! </ul>
!! The first error source is proportional to \f$\Delta t^2\f$ for the
!! exponential Euler method and does not depend on the chosen method
!! for the computation of the matrix exponential. The second error
!! source is controlled by the tolerance \f$\epsilon_{\exp}\f$, which
!! is supplied by the user using <tt>t_ti_init_data\%tol</tt>. For all
!! the methods implemented in this module, such tolerance has the
!! following meaning:
!! \f{displaymath}{
!!   \| \varphi(\Delta t \, A) \,{\bf f}({\bf u}^n) -
!!    \widetilde{\varphi(\Delta t \, A) \,{\bf f}({\bf u}^n)}\|_2 \leq
!!    \epsilon_{\exp}
!!   \| \varphi(\Delta t \, A) \,{\bf f}({\bf u}^n)\|_2
!! \f}
!! where a tilde indicates the numerical value. In other words,
!! \f$\epsilon_{\exp}\f$ controls the local truncation error, defined
!! as a relative error.
!!
!! \section krylovmethods Krylov space methods
!!
!! This method for the approximation of either the matrix exponential
!! \f$e^A\f$ or the matrix function \f$\varphi(A)\f$, both applied
!! to a generic vector \f${\bf w}\f$, is due to Y. Saad, for instance
!! in <a href="http://dx.doi.org/10.1137/0729014">[Saad, SIAM J.
!! Numer. Anal., 1992]</a>. The first observation is that
!! \f$\varphi(A)\f$ can be computed as the exponential of a matrix
!! obtained from \f$A\f$ by adding one row and one column. Then, one
!! notices that \f$e^A{\bf w}\f$ can be approximated combining the
!! Arnoldi decomposition algorithm, expressing \f$e^A{\bf w}\f$ as
!! \f$e^H\f$ applied to a suitable projection of \f${\bf w}\f$ for a
!! small, full matrix \f$H\f$, with the use of Pad&eacute;
!! approximations and a scaling and squaring procedure to compute
!! \f$e^H\f$. This second step, namely the computation of the
!! exponential of a small, dense matrix, is summarized and implemented
!! in the subroutine expa(), and can be considered exact up to machine
!! precision. More challenging is the use of the Arnoldi procedure,
!! especially concerning the optimal choice of the dimension of the
!! Krylov space, which we recall here.
!!
!! Given an arbitrary \f$n\f$-dimensional vector \f${\bf v}\f$ and an
!! \f$n\times n\f$ matrix \f$A\f$, consider the \f$m\f$-dimensional
!! Krylov space \f$K_m(A)={\rm span}({\bf v}, A{\bf v},  A^2 {\bf
!! v},\dots, A^{m-1} {\bf v})\f$, i.e. the linear space generated by
!! \f${\bf v}\f$ and the first \f$m-1\f$ powers of \f$A\f$ applied to
!! \f${\bf v}\f$. The Arnoldi algorithm consists in the Gram-Schmidt
!! orthogonalization of the generators of \f$K_m({\bf A})\f$, thus
!! resulting in the determination of an orthonormal basis \f${\bf v}_1
!! = {\bf v}/\|{\bf v}\|, {\bf v}_2, \dots, {\bf v}_m\f$ for this
!! linear space. Denoting by \f$V_m = [ {\bf v}_1, \dots, {\bf
!! v}_m]\f$ the \f$n\times m\f$ orthogonal matrix associated to this
!! basis and by \f$H_m\f$ the upper Hessenberg matrix of the
!! coefficients computed during the orthogonalization, it can be
!! proven that, for each \f$m=1,\dots,n\f$,
!! \f{displaymath}{
!!  A V_m = V_m H_m + h_{m+1,m}{\bf v}_{m+1}{\bf e}_m^T,
!! \f}
!! where \f${\bf e}_i\f$ is the \f$i\f$-th unit vector of the
!! canonical basis. This relation can be exploited to transform
!! polynomials in \f$A\f$ into polynomials in \f$H_m\f$. In fact,
!! defining \f$\beta=\|{\bf v}\|_2\f$, we have (see also lemma 3.1 in
!! <a href="http://dx.doi.org/10.1137/0729014">[Saad, SIAM J. Numer.
!! Anal., 1992]</a>)
!! \f{displaymath}{
!!  \begin{array}{rcl}
!!   A^k{\bf v} & = & A^{k-1}A\,\beta V_m{\bf e}_1 \\
!!   & = &
!!   \beta A^{k-1}\left(V_mH_m + h_{m+1,m}{\bf v}_{m+1}{\bf e}_m^T
!!   \right){\bf e}_1 \\
!!   & = &
!!   \beta A^{k-1}V_mH_m {\bf e}_1 \\
!!   & = &
!!   \beta A^{k-2}A\,V_mH_m {\bf e}_1 \\
!!   & = &
!!   \beta A^{k-2}\left( V_m H_m + h_{m+1,m}{\bf v}_{m+1}{\bf e}_m^T
!!   \right) H_m {\bf e}_1 \\
!!   & = &
!!   \beta A^{k-2} V_m H_m^2{\bf e}_1 + \beta h_{m+1,m}A^{k-2}{\bf
!!   v}_{m+1}{\bf e}_m^T H_m {\bf e}_1 \\
!!   & = &
!!   \beta A^{k-3} V_m H_m^3{\bf e}_1 + \beta h_{m+1,m} \left(
!!   A^{k-3}{\bf v}_{m+1}{\bf e}_m^T H_m^2 {\bf e}_1 + A^{k-2}{\bf
!!   v}_{m+1}{\bf e}_m^T H_m {\bf e}_1 \right)\\
!!   & = & \ldots
!!  \end{array}
!! \f}
!! Considering that \f${\bf e}_m^T H_m^k {\bf e}_1 =
!! \left(H^k_m\right)_{m1}\f$ and that \f$H_m^k\f$ has \f$k\f$ nonzero
!! lower diagonals, we see that \f${\bf e}_m^T H_m^k {\bf e}_1 = 0\f$
!! for \f$k<m-1\f$, implying that
!! \f{displaymath}{
!!   A^k{\bf v} = \beta V_m H_m^k {\bf e}_1, \qquad k<m
!! \f}
!! and for a generic polynomial \f$q_k\f$ of degree \f$k<m\f$
!! \f{displaymath}{
!!   q_k(A){\bf v} = \beta V_m q_k(H_m) {\bf e}_1.
!! \f}
!!
!! Approximating a generic matrix function \f$f(A)\f$ by the
!! corresponding Taylor polynomial, this leads to the approximation
!! \f{displaymath}{
!!  f(A){\bf v} \approx \beta V_m f(H_m){\bf e}_1
!! \f}
!! whenever \f$f\f$ is an analytic function. Precise bounds are given
!! in <a
!! href="http://dx.doi.org/10.1137/S0036142995280572">[Hochbruck,
!! Lubich, SIAM J. Numer. Anal., 1997]</a> and in the literature
!! referenced therein. In particular, it can be stated precisely in
!! which way the approximation error depends on the matrix
!! characteristics. This idea can now be applied to the exponential
!! time integrator taking \f$f(z)=\varphi(z)\f$. The function
!! \f$\varphi(H_m)\f$ in turn can be expressed in terms of the matrix
!! exponential using <a
!! href="http://dx.doi.org/10.1002/fld.1902">[Schulze, Schmid,
!! Sesterhenn, Int. J. Num. Met. Fluids, 2009]</a>, equation (16),
!! \f{displaymath}{
!!  \exp\left(\left[\begin{array}{cc}
!!   H_m & {\bf e}_1 \\ {\bf 0}^T & 0
!!  \end{array}\right]\right) =
!!  \left[\begin{array}{cc}
!!   \exp(H_m) & \varphi(H_m)\,{\bf e}_1 \\ {\bf 0}^T & 1
!!  \end{array}\right],
!! \f}
!! and the exponential of the full matrix can be computed by the
!! highly optimized subroutine expa().
!!
!! Concerning the stopping criterion, following <a
!! href="http://dx.doi.org/10.1002/fld.1902">[Schulze, Schmid,
!! Sesterhenn, Int. J. Num. Met. Fluids, 2009]</a>, section 2.2, we
!! use the estimate
!! \f{displaymath}{
!!   \| \varphi(\Delta t \, A) \,{\bf f}({\bf u}^n) -
!!    \widetilde{\varphi(\Delta t \, A) \,{\bf f}({\bf u}^n)}\|_2
!!    \approx \beta\Delta t| h_{m+1,m}{\bf e}_m^T\varphi(\Delta
!!    t\,H_m){\bf e}_1 |
!! \f}
!! where the quantities \f$h_{m+1,m}\f$ and \f${\bf
!! e}_m^T\varphi(\Delta t\,H_m){\bf e}_1\f$ are already available as
!! parts of the algorithm (indeed, at the slight additional cost of
!! computing \f$\varphi(\Delta t\,H_m){\bf e}_1\f$ at each Arnoldi
!! iteration and not only at the last one).
!!
!! \section lejapointmethods Interpolation at Leja points
!!
!! This method is discussed in the following papers: <a
!! href="http://dx.doi.org/10.1016/j.cam.2003.11.015">[Caliari,
!! Vianello, Bergamaschi, J. Comput. Appl. Math., 2004]</a>, <a
!! href="http://dx.doi.org/10.1007/11758549_93">[Bergamaschi, Caliari,
!! Mart&iacute;nez, Vianello, Computational Science - ICCS 2006,
!! 2006]</a>, <a
!! href="http://dx.doi.org/10.1007/s00607-007-0227-1">[Caliari,
!! Computing, 2007]</a> and <a
!! href="http://dx.doi.org/10.1016/j.apnum.2008.03.021">[Caliari,
!! Ostermann, Appl. Num. Math., 2009]</a>. The underlying idea of
!! this approach is: for a generic matrix function \f$f(A)\f$
!! <ol>
!!  <li> define a polynomial interpolator \f$\Pi f\f$ of \f$f\f$ in
!!  \f$\mathbb{C}\f$
!!  <li> evaluate the polynomial at the matrix argument \f$A\f$, so
!!  that
!!  \f{displaymath}{
!!   f(A) \approx \Pi f(A)
!!  \f}
!!  or equivalently
!!  \f{displaymath}{
!!   f(A){\bf v} \approx \Pi f(A){\bf v}.
!!  \f}
!! </ol>
!! This algorithm thus is computable because the problem of evaluating
!! the matrix function, which is not (easily) computable, is split
!! into two steps which can be easily carried out: interpolation of a
!! known scalar function and evaluation of a matrix polynomial. A
!! convenient form for this algorithm is represented by the Newton
!! form of the polynomial interpolation, so that
!! \f{displaymath}{
!!  \Pi f(z) = \sum_{k=0}^n f[z_0,\ldots,z_k]\,\omega_k(z)
!! \f}
!! where \f$\omega_0=1\f$, \f$\omega_k(z) =
!! \prod_{i=0}^{k-1}(z-z_i)\f$ and \f$f[z_0,\ldots,z_k]\f$ are the
!! divided differences of \f$f\f$. Then one has
!! \f{displaymath}{
!!  f(A){\bf v} \approx \Pi f(A){\bf v} =
!!  f[z_0]{\bf v} + f[z_0,z_1](A-z_0I){\bf v} +
!!  f[z_0,z_1,z_2](A-z_1I)(A-z_0I){\bf v} + \ldots
!! \f}
!! from which it is clear that each additional term in the sum
!! requires one more divided difference of \f$f\f$ and one more term
!! of the form \f$(A-z_iI){\bf p}\f$ for some vector \f${\bf p}\f$.
!! The most critical element of this method is the choice of the
!! interpolation points \f$z_i\f$, which in the algorithm considered
!! here are chosen as the real Leja points on an interval which
!! contains the extreme eigenvalues of \f$A\f$ (see <a
!! href="http://dx.doi.org/10.1007/BF02017352">[Reichel, BIT Numerical
!! Mathematics, 1990]</a> and <a
!! href="http://dx.doi.org/10.1016/j.cam.2003.11.015">[Caliari,
!! Vianello, Bergamaschi, J. Comput. Appl. Math., 2004]</a>). Other
!! two relevant aspects for the implementation are numerically stable
!! algorithms to compute the Leja points and the divided differences
!! of \f$f\f$, for which we refer to <a
!! href="http://etna.mcs.kent.edu/vol.7.1998/pp124-140.dir/pp124-140.pdf">[Baglama,
!! Calvetti, Reichel, Electronic Transactions on Numerical Analysis,
!! 1998]</a> and <a
!! href="http://dx.doi.org/10.1007/s00607-007-0227-1">[Caliari,
!! Computing, 2007]</a>. Finally, concerning the stopping criterion,
!! we observe that
!! \f{displaymath}{
!!  \varphi(\Delta t\,A){\bf f}({\bf u}^n) \approx
!!  \varphi[z_0]\underbrace{{\bf f}({\bf u}^n)}_{{\bf p}_0} +
!!  \varphi[z_0,z_1]\underbrace{(\Delta t\,A-z_0I) {\bf f}({\bf
!!  u}^n)}_{{\bf p}_1} +
!!  \varphi[z_0,z_1,z_2]\underbrace{(\Delta t\,A-z_1I)(\Delta
!!  t\,A-z_0I) {\bf f}({\bf u}^n)}_{{\bf p}_2} + \ldots
!! \f}
!! so we stop the iterations when the last \c ncheck terms satisfy
!! \f{displaymath}{
!!  |\varphi[z_0,\ldots,z_k]|\,\|{\bf p}_k\|_2 \leq \epsilon_{\exp}
!!  \|\widetilde{\varphi(\Delta t\,A){\bf f}({\bf u}^n)}\|_2.
!! \f}
!!
!! \subsection Lejarescaling Rescaling of the interpolation nodes
!!
!! As discussed in <a
!! href="http://dx.doi.org/10.1016/j.cam.2003.11.015">[Caliari,
!! Vianello, Bergamaschi, J. Comput. Appl. Math., 2004]</a>, it is
!! useful to rescale the interpolation nodes on the interval
!! \f$[-2\,,2]\f$. To this end, let us introduce the affine map
!! \f{displaymath}{
!!  z = c+\gamma\xi, \qquad \xi = \frac{z-c}{\gamma},
!! \f}
!! with \f$c=(a+b)/2\f$, \f$\gamma=(b-a)/4\f$, so that
!! \f$z\in[a\,,b]\implies \xi\in[-2\,,2]\f$. We then define
!! \f{displaymath}{
!!  \hat{\varphi}(\xi) = \varphi(c+\gamma\xi)
!! \f}
!! and note that
!! \f{displaymath}{
!!  \varphi[z_0,\ldots,z_k] =
!!  \frac{1}{\gamma^k}\hat{\varphi}[\xi_0,\ldots,\xi_k], \qquad
!!  \omega_k(z) = \gamma^k \prod_{i=0}^{k-1}(\xi-\xi_i) =
!!  \gamma^k\omega_k(\xi).
!! \f}
!! The Newton polynomial can thus be written as
!! \f{displaymath}{
!!  \Pi \varphi(z) = \sum_{k=0}^n \hat{\varphi}[\xi_0,\ldots,\xi_k]\,
!!   \omega_k(\xi(z)).
!! \f}
!! Now, setting \f$z=\Delta t\,A\f$ implies \f$\xi=\frac{\Delta
!! t}{\gamma}\,A - \frac{c}{\gamma}I\f$ and we have the computationally
!! more stable expression
!! \f{displaymath}{
!!  \varphi(\Delta t\,A){\bf f}({\bf u}^n) \approx
!!  \hat{\varphi}[\xi_0]\underbrace{{\bf f}({\bf u}^n)}_{{\bf p}_0} +
!!  \hat{\varphi}[\xi_0,\xi_1]\underbrace{\left( \frac{\Delta
!!  t}{\gamma}A - \left(\frac{c}{\gamma}+\xi_0\right)I\right) {\bf
!!  f}({\bf u}^n)}_{{\bf p}_1} +
!!  \hat{\varphi}[\xi_0,\xi_1,\xi_2]\underbrace{\left( \frac{\Delta
!!  t}{\gamma}A-\left(\frac{c}{\gamma}+\xi_1\right)I\right)
!!  \left(\frac{\Delta
!!  t}{\gamma}A-\left(\frac{c}{\gamma}+\xi_0\right)I\right) {\bf
!!  f}({\bf u}^n)}_{{\bf p}_2} + \ldots
!! \f}
!!
!! \section parallelimpl Parallel implementation
!!
!! Most of the parallel details should be implemented in the \c c_stv
!! methods.
!! \note We assume here that the result of the scalar product is
!! available on <em>all</em> processors.
!<----------------------------------------------------------------------
module mod_expo

!-----------------------------------------------------------------------

 use mod_messages, only: &
   mod_messages_initialized, &
   error,   &
   warning, &
   info

 use mod_kinds, only: &
   mod_kinds_initialized, &
   wp, wp_p, wp_r

 use mod_mpi_utils, only: &
   mod_mpi_utils_initialized, &
   mpi_logical, mpi_integer, wp_mpi, &
   mpi_comm_world, &
   mpi_bcast

 use mod_state_vars, only: &
   mod_state_vars_initialized, &
   c_stv

 use mod_linal, only: &
   mod_linal_initialized, &
   invmat

 use mod_time_integrators_base, only: &
   mod_time_integrators_base_initialized, &
   c_ode, c_ods, c_tint, t_ti_init_data, t_ti_step_diag

!-----------------------------------------------------------------------
 
 implicit none

!-----------------------------------------------------------------------

! Module interface

 public :: &
   mod_expo_constructor, &
   mod_expo_destructor,  &
   mod_expo_initialized, &
   t_erb2kry, t_erb2lej, &
   c_expo, erb2kry_step_internals, erb2lej_step_internals

 private

!-----------------------------------------------------------------------

! Module types

 !> This base type is provided to simplify the implementation of
 !! the split exponential method on top of this module.
 !!
 !! Concrete extensions (see \c t_expokry_data and \c t_expolej_data)
 !! collect the method specific work arrays and parameters.
 type, abstract :: c_expo_data
 end type c_expo_data

 !> Same as \c c_expo_data: abstract base type for the exponential
 !! time integrators of this module, bundling the method specific
 !! data in a polymorphic component.
 type, extends(c_tint), abstract :: c_expo
  class(c_expo_data), allocatable :: data
 end type c_expo

 !> Additional work arrays and data for the Krylov space method.
 !!
 !! These objects are bundled in this type, rather than being included
 !! directly in \c t_erb2kry, to simplify the calls to \c
 !! erb2kry_step_internals.
 type, extends(c_expo_data) :: t_expokry_data
  !> Size of the Krylov space (<tt>mk+1</tt> Krylov vectors will be
  !! generated, accounting for the extended matrix \f$\tilde{V}\f$)
  integer :: mk = 10
  !> Krylov space tolerance \f$\epsilon_{\exp}\f$ used in the
  !! stopping criterion of the Arnoldi iteration
  real(wp) :: kry_tol = 1.0e-6_wp
  !> Hessenberg matrix \f$H_m\f$ produced by the Arnoldi process
  real(wp), allocatable :: h(:,:)
  !> exponential matrix (work array for the dense exponential)
  real(wp), allocatable :: ex(:,:)
  !> identity matrix (allocated only on the process with mpi_id 0,
  !! see \c erb2kry_init)
  real(wp), allocatable :: eye(:,:)
  !> MPI specific data: rank of this process
  integer :: mpi_id = 0
  !> MPI communicator used by this integrator
  integer :: mpi_comm = mpi_comm_world
 end type t_expokry_data

 !> 2nd order exponential Rosenbrock method, Krylov space version
 !!
 !! The time step is computed as
 !! \f{displaymath}{
 !!  {\bf u}^{n+1} = {\bf u}^{n} + \Delta t\, \varphi(\Delta t\,A)
 !!  {\bf f}({\bf u}^n),
 !! \f}
 !! where
 !! \f{displaymath}{
 !!   \varphi(\Delta t\,A){\bf f}({\bf u}^n)\approx \beta V_m
 !!   \varphi(\Delta t\,H_m){\bf e}_1
 !! \f}
 !! (see the general comments in \ref krylovmethods).
 !!
 !! A comment is required concerning the presence of scaling factor
 !! \f$\Delta t\f$, since in fact we deal here with the matrix
 !! \f$B=\Delta t\,A\f$. To this end, let us define the matrices
 !! \f$V^A\f$, \f$H^A\f$ and \f$V^B\f$, \f$H^B\f$, corresponding to
 !! the Arnoldi decomposition of \f$A\f$ and \f$B\f$, respectively.
 !! Considering the definition of the Krylov space, it is clear that
 !! \f$V^A=V^B\f$. Comparing the decomposition of the two matrices, it
 !! follows that \f$H^B=\Delta t\,H^A\f$. In the present
 !! implementation, we apply the Arnoldi procedure to \f$A\f$,
 !! obtaining \f$H_m=H^A_m\f$, and then compute the exponential of
 !! \f$\Delta t\,H_m\f$.
 !!
 !! Summarizing, computing the time-step proceeds as follows
 !! <ul>
 !!  <li> compute a basis of the Krylov space generated by \f$A\f$ and
 !!  \f${\bf f}({\bf u}^n)\f$ and orthogonalize it with the Arnoldi
 !!  procedure in order to obtain \f$V_m\f$ and \f$H_m\f$
 !!  <li> compute \f$\Delta t\,\varphi(\Delta t\,H_m)\,{\bf e}_1\f$
 !!  using
 !!  \f{displaymath}{
 !!   \exp\left(\left[\begin{array}{cc}
 !!    \Delta t\,H_m & \Delta t\,{\bf e}_1 \\ {\bf 0}^T & 0
 !!    \end{array}\right]\right) = \left[\begin{array}{cc} \exp(\Delta
 !!    t\,H_m) & \Delta t\,\varphi(\Delta t\,H_m)\,{\bf e}_1 \\ {\bf
 !!    0}^T & 1 \end{array}\right]
 !!  \f}
 !! (see <a href="http://dx.doi.org/10.1002/fld.1902">[Schulze,
 !! Schmid, Sesterhenn, Int. J. Num. Met. Fluids, 2009]</a>, equation
 !! (16), and <a href="http://dx.doi.org/10.1137/0729014">[Saad, SIAM
 !! J.  Numer. Anal., 1992]</a>, equation (13) ).
 !!  <li> compute \f$\Delta t\, \varphi( \Delta t\,A) {\bf f}({\bf
 !!  u}^n)\f$ as a linear combination of the Krylov vectors.
 !! </ul>
 !!
 !! The stopping criterion, according to \ref krylovmethods, should be
 !! \f{displaymath}{
 !!  \beta\Delta t| h_{m+1,m}{\bf e}_m^T\varphi(\Delta
 !!   t\,H_m){\bf e}_1 | \leq \epsilon_{\exp}\|\varphi(\Delta
 !!   t\,A)\,{\bf f}({\bf u}^n)\|_2.
 !! \f}
 !! Since recomputing the full right-hand-side of this expression at
 !! each Arnoldi iteration would imply a significant cost, we use the
 !! simplification
 !! \f{displaymath}{
 !!  \beta\Delta t| h_{m+1,m}{\bf e}_m^T\varphi(\Delta t\,H_m){\bf
 !!  e}_1 | \leq \epsilon_{\exp}\| {\bf f}({\bf u}^n)\|_2 =
 !!  \epsilon_{\exp}\beta,
 !! \f}
 !! i.e.
 !! \f{displaymath}{
 !!  \Delta t| h_{m+1,m}{\bf e}_m^T\varphi(\Delta t\,H_m){\bf e}_1 |
 !!  \leq \epsilon_{\exp}.
 !! \f}
 !!
 !! \section erb2kry_setup Initialization
 !!
 !! The maximum size of the Krylov space and the tolerance for the
 !! exponential computation are taken from <tt>init_data\%dim</tt> and
 !! <tt>init_data\%tol</tt>, respectively.
 type, extends(c_expo) :: t_erb2kry
 contains
  !> allocate the work arrays and read the integrator parameters
  procedure, pass(ti) :: init  => erb2kry_init
  !> advance the solution by one time step
  procedure, pass(ti) :: step  => erb2kry_step
  !> finalize the integrator
  procedure, pass(ti) :: clean => erb2kry_clean
 end type t_erb2kry

 !> Analogous to \c t_expokry_data: work data for the Leja point method
 type, extends(c_expo_data) :: t_expolej_data
  !> Tolerance \f$\epsilon_{\exp}\f$ for the stopping criterion
  real(wp) :: leja_tol = 1.0e-6_wp
  !> Maximum number of interpolation points
  !!
  !! This is the number of the divided differences that must be
  !! computed. The number of divided differences that can be
  !! considered accurate depends on the time step, and for double
  !! precision one can expect roughly 100 meaningful values.
  integer :: nmax = 20
  !> Coefficient \f$c\f$ of the affine transformation
  !! \f$z=c+\gamma\xi\f$
  real(wp) :: cc
  real(wp) :: gg !< coefficient \f$\gamma\f$ of the same transformation
  !> Leja points (zero based array)
  real(wp), allocatable :: xi(:)
  !> divided differences (zero based array)
  real(wp), allocatable :: dd(:)
  !> MPI specific data: rank of this process
  integer :: mpi_id = 0
  !> MPI communicator used by this integrator
  integer :: mpi_comm = mpi_comm_world
 end type t_expolej_data

 !> 2nd order exponential Rosenbrock method, Leja point version
 !!
 !! This implementation follows the pseudocode given in <a
 !! href="http://dx.doi.org/10.1016/j.cam.2003.11.015">[Caliari,
 !! Vianello, Bergamaschi, J. Comput. Appl. Math., 2004]</a>.
 !!
 !! The time-step is computed as follows:
 !! <ul>
 !!  <li> \f${\bf u}^{n+1} = \varphi(\Delta t\,A){\bf f}({\bf u}^n)\f$
 !!  (computed interpolating at the Leja points)
 !!  <li> \f${\bf u}^{n+1} = \Delta t\,{\bf u}^{n+1}\f$
 !!  <li> \f${\bf u}^{n+1} = {\bf u}^{n+1} + {\bf u}^{n}\f$.
 !! </ul>
 !! The first step, which is the most delicate one, is computed as
 !! \f{displaymath}{
 !!  \varphi(\Delta t\,A){\bf f}({\bf u}^n) =
 !!  \hat{\varphi}[\xi_0]\underbrace{{\bf f}({\bf u}^n)}_{{\bf p}_0} +
 !!  \hat{\varphi}[\xi_0,\xi_1]\underbrace{(\alpha A-\beta_0I)
 !!  \underbrace{{\bf f}({\bf u}^n)}_{{\bf p}_0}}_{{\bf p}_1} +
 !!  \hat{\varphi}[\xi_0,\xi_1,\xi_2]\underbrace{(\alpha
 !!  A-\beta_1I)\underbrace{(\alpha A-\beta_0I) \underbrace{{\bf f}({\bf
 !!  u}^n)}_{{\bf p}_0}}_{{\bf p}_1}}_{{\bf p}_2} + \ldots
 !! \f}
 !! with \f$\alpha=\frac{\Delta t}{\gamma}\f$ and
 !! \f$\beta_k=\frac{c}{\gamma}+\xi_k\f$, so that at each iteration we
 !! have to compute
 !! \f{displaymath}{
 !!  {\bf p}_k = (\alpha A-\beta_{k-1}I){\bf p}_{k-1} =
 !!  \alpha A{\bf p}_{k-1} -\beta_{k-1}{\bf p}_{k-1}.
 !! \f}
 !!
 !! The stopping criterion is defined as discussed in \ref
 !! lejapointmethods.
 !!
 !! \section erb2lej_setup Initialization
 !!
 !! The maximum number of Leja points, the tolerance for the
 !! exponential computation and the bounds of the interpolation
 !! interval are taken from <tt>init_data\%dim</tt>,
 !! <tt>init_data\%tol</tt> and <tt>init_data\%ir1(1:2)</tt>,
 !! respectively.
 type, extends(c_expo) :: t_erb2lej
 contains
  !> allocate the work arrays and read the integrator parameters
  procedure, pass(ti) :: init  => erb2lej_init
  !> advance the solution by one time step
  procedure, pass(ti) :: step  => erb2lej_step
  !> finalize the integrator
  procedure, pass(ti) :: clean => erb2lej_clean
 end type t_erb2lej

! Module variables

 ! public members

 !> Flag recording whether the module constructor has been called
 logical, protected ::               &
   mod_expo_initialized = .false.

 ! private members

 !> Module name used in the error/warning messages
 character(len=*), parameter :: &
   this_mod_name = 'mod_expo'

 ! These would be the parameters for the third order Rosenbrock method
 !real, parameter :: alpha=4./3., beta1=1.-1./(3.*alpha*alpha), &
 !                       beta2=1./(3.*alpha*alpha), gamma=0.5, igamma=2.

!-----------------------------------------------------------------------

contains

!-----------------------------------------------------------------------

 !> Module constructor
 !!
 !! Checks that all the modules this module depends on have been
 !! initialized, warns if the constructor is called twice and finally
 !! marks this module as initialized.
 subroutine mod_expo_constructor()
  character(len=*), parameter :: &
    this_sub_name = 'constructor'

   !Consistency checks ---------------------------
   ! test the logical flags directly: "x .eqv. .false." is an
   ! anti-idiom for ".not. x"
   if( .not.( mod_messages_initialized              .and. &
              mod_kinds_initialized                 .and. &
              mod_mpi_utils_initialized             .and. &
              mod_state_vars_initialized            .and. &
              mod_linal_initialized                 .and. &
              mod_time_integrators_base_initialized ) ) then
     call error(this_sub_name,this_mod_name, &
                'Not all the required modules are initialized.')
   endif
   if(mod_expo_initialized) then
     call warning(this_sub_name,this_mod_name, &
                  'Module is already initialized.')
   endif
   !----------------------------------------------

   mod_expo_initialized = .true.
 end subroutine mod_expo_constructor

!-----------------------------------------------------------------------
 
 !> Module destructor
 !!
 !! Errors out if the module was never initialized, otherwise marks it
 !! as no longer initialized.
 subroutine mod_expo_destructor()
  character(len=*), parameter :: &
    this_sub_name = 'destructor'

   !Consistency checks ---------------------------
   ! ".not. x" instead of the anti-idiom "x .eqv. .false."
   if(.not.mod_expo_initialized) then
     call error(this_sub_name,this_mod_name, &
                'This module is not initialized.')
   endif
   !----------------------------------------------

   mod_expo_initialized = .false.
 end subroutine mod_expo_destructor

!-----------------------------------------------------------------------

 !> Initialize the Krylov-space exponential Rosenbrock integrator
 !!
 !! Allocates the method data (\c t_expokry_data), reads the optional
 !! \c init_data parameters (Krylov space dimension, tolerance, MPI
 !! rank and communicator) and sources <tt>mk+3</tt> state-vector
 !! work arrays from \c u0 (see \c erb2kry_step for their layout).
 !! \note \c ode, \c t and \c ods are part of the unified \c init
 !! interface but are not used here.
 subroutine erb2kry_init(ti,ode,dt,t,u0,ods,init_data,step_diag)
  class(c_ode),  intent(in)    :: ode
  real(wp),      intent(in)    :: dt, t
  class(c_stv),  intent(inout) :: u0
  class(c_ods),  intent(inout) :: ods
  class(t_erb2kry), intent(out) :: ti
  type(t_ti_init_data), optional, intent(in)  :: init_data
  type(t_ti_step_diag), optional, intent(out) :: step_diag
 
  integer :: mk1, i
  character(len=*), parameter :: &
    this_sub_name = 'erb2kry_init'

   allocate(t_expokry_data::ti%data)
   select type(data=>ti%data); type is(t_expokry_data)

   ti%dt = dt

   ! work arrays are stored in the additional components while the
   ! krylov space is stored in work (together with two other arrays
   ! used for the rhs and as temporary storage)
   if(present(init_data)) then
     data%mk       = init_data%dim
     data%kry_tol  = init_data%tol
     data%mpi_id   = init_data%mpi_id
     data%mpi_comm = init_data%mpi_comm
   endif
   mk1 = data%mk + 1 ! if init_data is absent, the default mk is used
   allocate(    data%h(mk1,mk1) , &
               data%ex(mk1,mk1) )
   data%h = 0.0_wp; data%ex = 0.0_wp
   ! the identity matrix is allocated only on the rank 0 process
   if(data%mpi_id.eq.0) then
     allocate(data%eye(mk1,mk1) ); data%eye = 0.0_wp
     do i=1,mk1
       data%eye(i,i)=1.0_wp
     enddo
   endif
   ! mk+3 work vectors: mk+1 Krylov vectors, the rhs and a scratch one
   call u0%source_vect(ti%work,data%mk+3)
   ! Diagnostics
   if(present(step_diag)) then
     step_diag%name_d1 = 'ti_res'
     step_diag%name_d2 = 'ti_H'
     allocate(step_diag%d1(mk1) , step_diag%d2(mk1,mk1))
   endif

   end select

 end subroutine erb2kry_init
 
!-----------------------------------------------------------------------

 !> 2nd order exponential Rosenbrock method
 subroutine erb2kry_step(unew,ti,ode,t,unow,ods,step_diag)
  class(t_erb2kry), intent(inout) :: ti
  class(c_ode),  intent(in)    :: ode
  real(wp),      intent(in)    :: t
  class(c_stv),  intent(in)    :: unow
  class(c_ods),  intent(inout) :: ods
  class(c_stv),  intent(inout) :: unew
  type(t_ti_step_diag), optional, intent(inout) :: step_diag
   
  integer :: ikry, irhs, itmp

   ! Indexes of some working arrays for erb2kry_step_internals
   select type(data=>ti%data); type is(t_expokry_data)
     ikry = data%mk+1 ! we generate mk+1 Krylov vectors 
     irhs = data%mk+2 ! rhs stored after the Krylov vectors
     itmp = data%mk+3 ! additional array used for temporary variables
   end select

   ! compute the tendency: store it in work(irhs)
   call ode%rhs(ti%work(irhs),t,unow,ods)

   ! Notice that the same array is used as fx0 and vvv; also, the
   ! fields of ti are unpacked to avoid aliasing the intent(out)
   ! components.
   call erb2kry_step_internals(unew,ods , ti%work(:ikry),ti%data , &
                ode,t , unow,ti%work(irhs),ti%work(irhs) , ti%dt , &
                ti%work(itmp) , step_diag)

   call unew%incr(unow)

 end subroutine erb2kry_step

!-----------------------------------------------------------------------
 
 !> Provide the actual implementation for \c erb2kry_step
 !!
 !! The reason for providing this subroutine as a separate one from \c
 !! erb2kry_step is to have a more general interface compared with the
 !! unified interface of the time integrators. This allows using this
 !! subroutine as a building block for more complex time integrators.
 !!
 !! \note This function should be relevant only for the implementation
 !! of new time integrators; from the user viewpoint \c erb2kry_step
 !! should be enough.
 !!
 !! More in detail, this function computes
 !! \f{displaymath}{
 !!  {\bf w}=\Delta t\,\varphi\left(\Delta t\,J_{{\bf f}_s}({\bf
 !!  x}_0)\right){\bf v}
 !! \f}
 !! where \f${\bf f}_s\f$ is the function specified by
 !! <tt>ode\%rhs</tt> and <tt>term</tt>, and \f${\bf x}_0\f$ and
 !! \f${\bf v}\f$ are two arbitrary vectors. An additional argument
 !! <tt>fx0</tt> is introduced containing \f${\bf f}_s({\bf x}_0)\f$
 !! since this can often save some computations: in fact <tt>vvv</tt>
 !! and <tt>fx0</tt> are often two aliases to the same object (note
 !! that both of them have intent in).
 !!
 !! See the documentation for \c i_rhs in \c c_ode for a full
 !! description concerning the use of \c term to specify \f${\bf
 !! f}_s\f$.
 !!
 !! \note The time step \c dt can be arbitrary.
 subroutine erb2kry_step_internals(www,ods , kry,data , ode,t , &
                        x0,fx0,vvv , dt , tmp , step_diag , term)
  class(c_stv), intent(inout) :: kry(:) !< Krylov space
  class(c_expo_data), intent(inout) :: data
  class(c_ode), intent(in)    :: ode
  real(wp),     intent(in)    :: t
  class(c_stv), intent(in)    :: x0, fx0, vvv
  real(wp),     intent(in)    :: dt
  class(c_stv), intent(inout) :: tmp !< work variable for directional
  class(c_stv), intent(inout) :: www !< output (overwritten)
  class(c_ods), intent(inout) :: ods !< scratch storage
  type(t_ti_step_diag), optional, intent(inout) :: step_diag
  integer,      optional, intent(in) :: term(2)

  integer :: mk, k, l, ierr
  real(wp) :: beta, betai, h_kp1_k, err
  real(wp), allocatable :: errnorms(:)
  logical :: master, stop_krylov
  character(len=*), parameter :: &
    this_sub_name = 'erb2kry_step_internals'

   ! The algorithm is implemented only for t_expokry_data: any other
   ! dynamic type of data silently skips the whole computation.
   select type(data); type is(t_expokry_data)

   master = data%mpi_id.eq.0

   ! The residual history is collected on the master rank only (and
   ! only when diagnostics have been requested).
   if(master.and.present(step_diag)) allocate(errnorms(data%mk))

   ! get the dimension of the Krylov space
   mk = data%mk

   ! Normalization of vvv
   ! NOTE(review): beta = ||vvv||; a zero vvv would produce a
   ! division by zero here -- assumed nonzero by the callers.
   beta = sqrt( vvv%scal( vvv ) )
   betai = 1.0_wp/beta

   ! compute first Krylov vector (rescaled vvv)
   call kry(1)%mlt(betai,vvv)
   
   ! build the Krylov space (Arnoldi iteration: matrix-vector
   ! products with the directional Jacobian, then orthogonalization)
   data%h = 0.0_wp
   krylov_do: do k=1,mk

     ! new Krylov vector: kry(k+1) ~ J*kry(k), with J the discrete
     ! Jacobian of ode%rhs evaluated at x0
     call directional( kry(k+1) , ode,t,x0,fx0,kry(k),ods , &
                       tmp , term=term )

     ! Gram-Schmidt process (modified variant: each projection uses
     ! the already updated vector); h collects the Hessenberg matrix
     do l=1,k
       data%h(l,k) = kry(k+1)%scal( kry(l) )
       call kry(k+1)%inlt( -data%h(l,k) , kry(l) )
     enddo
     h_kp1_k = sqrt( kry(k+1)%scal( kry(k+1) ) )

     ! The master thread computes the exponential, checks the residual
     ! and communicates whether new iterations are required or not.
     if(master) then
       ! add one column and one row to H_m
       data%h(1:k,k+1) = data%eye(1:k,1)
       ! Compute ex = exp(dt Hbar) for the augmented matrix Hbar
       ! Note that:
       ! a) data%h(k+1,:) is already zero
       ! b) the coefficient dt is included both for the matrix H and
       !   for the additional column e1, so that the final result is
       !   dt*phi(dt*H)*e1 in the (k+1)-th column
       call expa( dt*data%h(:k+1,:k+1) ,                   &
                  data%ex(:k+1,:k+1) , data%eye(:k+1,:k+1) )
       err = dt*h_kp1_k*abs(data%ex(k,k+1))
       stop_krylov = err.le.data%kry_tol
       ! In the diagnostic, include beta so that the error is
       ! homogeneous to phi(dt*A)*f(u^n)
       if(present(step_diag)) errnorms(k) = beta*err
     endif
     ! stop_krylov is defined on the master rank only: the broadcast
     ! makes the stopping decision uniform across the communicator
     call mpi_bcast(stop_krylov,1,mpi_logical,0,data%mpi_comm,ierr)
     if(stop_krylov) exit krylov_do

     ! normalize the new vector (skipped at the last iteration,
     ! where kry(k+1) is not used anymore)
     if(k.lt.mk) then
       data%h(k+1,k) = h_kp1_k
       call kry(k+1)%tims( 1.0_wp/data%h(k+1,k) )
     endif

   enddo krylov_do
   if(master.and.present(step_diag)) step_diag%max_iter = k.gt.mk
   mk = min(k,mk) ! when reaching the end of the do loop k=mk+1
   ! broadcast the coefficients of the final linear combination (the
   ! first mk entries of the (mk+1)-th column of ex, master-computed)
   call mpi_bcast(data%ex(1,mk+1),mk,wp_mpi,0,data%mpi_comm,ierr)

   ! Linear combination: beta *( linear combination )
   call www%lcb( data%ex(1:mk,mk+1) , kry(1:mk) )
   call www%tims(beta)

   ! Diagnostics: only the master rank fills d1/d2; the other ranks
   ! flag the missing data with iterations = -1
   if(present(step_diag)) then
     if(master) then
       step_diag%iterations = mk
       if(allocated(step_diag%d1)) deallocate(step_diag%d1)
       allocate(step_diag%d1(mk)); step_diag%d1 = errnorms(:mk)
       if(allocated(step_diag%d2)) deallocate(step_diag%d2)
       allocate(step_diag%d2(mk,mk)); step_diag%d2 = data%h(:mk,:mk)
     else
       step_diag%iterations = -1
     endif
   endif

   if(allocated(errnorms)) deallocate(errnorms)

   end select

 end subroutine erb2kry_step_internals
 
!-----------------------------------------------------------------------

 !> Cleanup for the Krylov exponential integrator.
 !!
 !! The diagnostic arrays are deallocated only when actually
 !! allocated: \c erb2kry_step_internals allocates \c d1 and \c d2 on
 !! the master MPI rank only, so on the other ranks an unconditional
 !! deallocate would be a runtime error. This also makes the cleanup
 !! consistent with \c erb2lej_clean.
 pure subroutine erb2kry_clean(ti,ode,step_diag)
  class(c_ode),  intent(in)    :: ode
  class(t_erb2kry), intent(inout) :: ti
  type(t_ti_step_diag), optional, intent(inout) :: step_diag

  character(len=*), parameter :: &
    this_sub_name = 'erb2kry_clean'

  ! work and data are always allocated during initialization
  deallocate(ti%work,ti%data)
  ! guard each diagnostic field (master-only allocations)
  if(present(step_diag)) then
    if(allocated(step_diag%d1)) deallocate(step_diag%d1)
    if(allocated(step_diag%d2)) deallocate(step_diag%d2)
  endif

 end subroutine erb2kry_clean

!-----------------------------------------------------------------------
 
 !> Setup for the exponential integrator with Leja points.
 !!
 !! Use <tt>init_data\%ir1(1:2)</tt> to provide estimates of the real
 !! eigenvalues of the Jacobian matrix \f$\frac{d{\bf f}}{d{\bf
 !! u}}\f$, <em>without including the time step</em>. In this
 !! function, this interval is multiplied by \c dt to obtain the
 !! eigenvalues of \f$\Delta t\frac{d{\bf f}}{d{\bf u}}\f$ which
 !! define the interval effectively used for interpolation.
 !!
 !! To compute the divided differences, we could use the standard
 !! definition. This however turns out to be extremely unstable due to
 !! rounding errors, so that only few terms in the Newton polynomial
 !! could be used. A much more stable algorithm is obtained following <a
 !! href="http://dx.doi.org/10.1007/s00607-007-0227-1">[Caliari,
 !! Computing, 2007]</a>, section 3, and <a
 !! href="http://dx.doi.org/10.1002/fld.1902">[Schulze, Schmid,
 !! Sesterhenn, Int. J. Num. Met. Fluids, 2009]</a>, equation (16), so
 !! that we compute the divided differences as
 !! \f{displaymath}{
 !!  \hat{\varphi}[\xi_0,\ldots,\xi_k] =
 !!  \left(\hat{\varphi}(H_n)\,{\bf e}_1\right)_k
 !! \f}
 !! with
 !! \f{displaymath}{
 !!  H_n = \left[\begin{array}{ccccc}
 !!   \xi_0 \\
 !!    1 & \xi_1 \\
 !!   & 1 & \ddots \\
 !!   & & \ddots & \ddots \\
 !!    & & & 1 & \xi_n
 !!  \end{array}\right]
 !! \f}
 !! and evaluate \f$\hat{\varphi}(H_n)\,{\bf e}_1\f$ exploiting
 !! \f$\hat{\varphi}(H_n)\,{\bf e}_1 = \varphi(\gamma H_n+cI)\,{\bf
 !! e}_1\f$ and
 !! \f{displaymath}{
 !!  \exp\left(\left[\begin{array}{cc}
 !!   \gamma H_n+cI & {\bf e}_1 \\ {\bf 0}^T & 0
 !!  \end{array}\right]\right) =
 !!  \left[\begin{array}{cc}
 !!   \exp(\gamma H_n+cI) & \varphi(\gamma H_n+cI)\,{\bf e}_1 \\ {\bf
 !!   0}^T & 1
 !!  \end{array}\right],
 !! \f}
 !! which allows to use the highly optimized function expa().
 subroutine erb2lej_init(ti,ode,dt,t,u0,ods,init_data,step_diag)
  class(c_ode),  intent(in)    :: ode
  real(wp),      intent(in)    :: dt, t
  class(c_stv),  intent(inout) :: u0
  class(c_ods),  intent(inout) :: ods
  class(t_erb2lej), intent(out) :: ti
  type(t_ti_init_data), optional, intent(in)  :: init_data
  type(t_ti_step_diag), optional, intent(out) :: step_diag
 
  integer, parameter :: &
    nx = 10000 ! equally spaced points to compute the Leja points
  integer :: i, ierr, nmax
  real(wp) :: aa, bb
  real(wp), allocatable :: x(:), omega(:), mat(:,:), eye(:,:), exm(:,:)
  character(len=*), parameter :: &
    this_sub_name = 'erb2lej_init'

   allocate(t_expolej_data::ti%data)
   select type(data=>ti%data); type is(t_expolej_data)

   ti%dt = dt

   ! Sentinel values with aa > bb: this marks the focal interval as
   ! "not provided" and is detected by the check below
   aa = 1.0_wp; bb=-1.0_wp ! redefined later
   if(present(init_data)) then
     data%nmax     = init_data%dim
     data%leja_tol = init_data%tol
     if(allocated(init_data%ir1)) then
       ! the user provides estimates of the real eigenvalues of
       ! df/du; multiply by dt to get the spectrum of dt*df/du
       aa = dt*minval(init_data%ir1(1:2))
       bb = dt*maxval(init_data%ir1(1:2))
     endif
     data%mpi_id   = init_data%mpi_id
     data%mpi_comm = init_data%mpi_comm
   endif
   nmax = data%nmax ! simplify the following code

   ! Define the spectral focal interval
   if(aa.gt.bb) then ! no values have been provided
     call warning(this_sub_name,this_mod_name, &
       'To use the Leja point interpolation you should specify '// &
       'the limits of the spectral focal interval using the '   // &
       'input argument "init_data%ir1(1:2)". The default values'// &
       ' [-1,1] will likely result in very poor convergence.')
     aa = -1.0_wp; bb = 1.0_wp
   endif
   ! Coefficients of the affine map z -> gg*z + cc sending the
   ! reference interval [-2,2] onto [aa,bb]
   data%cc = (aa+bb)/2.0_wp
   data%gg = (bb-aa)/4.0_wp

   allocate( data%xi(0:nmax) , data%dd(0:nmax) )
   ! Compute the Leja points and the divided differences
   if(data%mpi_id.eq.0) then
     
     ! 1) Compute the Leja points on [-2,2]: greedy maximization of
     ! the nodal polynomial on a fine uniform grid of nx candidates
     allocate(x(nx),omega(nx))
     x = (/ (4.0_wp*(real(i,wp)/real(nx-1,wp))-2.0_wp , i=0,nx-1) /)
     data%xi(0) = 2.0_wp ! first Leja point, with maximum abs
     omega = abs(x-data%xi(0)) ! initialize the nodal polynomial
     do i=1,nmax
       ! next Leja point: grid point maximizing |omega|
       data%xi(i) = x(maxloc(omega,1)+lbound(omega,1)-1)
       omega = abs(x-data%xi(i)) * omega
     enddo
     deallocate(omega,x)
     
     ! 2) Compute the divided differences as the last column of the
     ! exponential of the augmented bidiagonal matrix (see the
     ! subroutine description), evaluated with expa()
     allocate( mat(nmax+2,nmax+2) , &
               eye(nmax+2,nmax+2) , &
               exm(nmax+2,nmax+2) )
     mat = 0.0_wp; eye = 0.0_wp
     do i=1,nmax+1
       mat(i,i) = data%gg*data%xi(i-1) + data%cc ! diagonal
     enddo
     do i=2,nmax+1
       mat(i,i-1) = data%gg ! subdiagonal
     enddo
     mat(1,nmax+2) = 1.0_wp ! last column
     do i=1,nmax+2
       eye(i,i) = 1.0_wp
     enddo
     call expa(mat,exm,eye)
     data%dd = exm(1:nmax+1,nmax+2)
     deallocate( mat , eye , exm )

     ! This is the standard definition of the divided differences;
     ! however this computation is highly unstable due to rounding
     ! errors.
     !data%dd = phi(data%xi)
     !do i=1,nmax
     !  do k=nmax,i,-1
     !    data%dd(k) = (data%dd(k)-data%dd(k-1)) &
     !                /(data%xi(k)-data%xi(k-i))
     !  enddo
     !enddo
   endif
   ! Points and differences are computed on the master rank only and
   ! then broadcast to the whole communicator
   call mpi_bcast(data%xi(0),nmax+1,wp_mpi,0,data%mpi_comm,ierr)
   call mpi_bcast(data%dd(0),nmax+1,wp_mpi,0,data%mpi_comm,ierr)

   ! allocate the 4 work vectors used by erb2lej_step
   call u0%source_vect(ti%work,4)

   ! Diagnostics
   if(present(step_diag)) then
     step_diag%name_d1 = 'ti_res'
     step_diag%name_d2 = 'NewtonDD'
     if(data%mpi_id.eq.0) then
       allocate(step_diag%d1(0:nmax) , step_diag%d2(4,0:nmax))
       step_diag%d1 = -1.0_wp ! undefined values
       ! d2 is used to return the Leja points and the divided
       ! differences; these values never change during the time
       ! integration and this diagnostic is not used anymore.
       step_diag%d2(1,:) = data%xi
       step_diag%d2(2,:) = ((bb-aa)/4.0_wp)*data%xi + (aa+bb)/2.0_wp
       step_diag%d2(3,:) = phi(data%xi)
       step_diag%d2(4,:) = data%dd
     endif
   endif

   end select

 contains

  ! See [Caliari, Computing, 2007], section 2.
  ! phi(z) = (exp(z)-1)/z, with phi(0)=1. For moderate |z| the
  ! denominator is written as log(exp(z)) so that numerator and
  ! denominator share the same rounding of exp(z), which reduces the
  ! cancellation error of the naive formula near z=0.
  elemental function phi(z)
   real(wp), intent(in) :: z
   real(wp) :: phi

   real(wp), parameter :: eps = epsilon(0.0_wp)

    if(abs(z).lt.eps) then
      phi = 1.0_wp
    elseif(abs(z).lt.1.0_wp) then
      phi = (exp(z)-1.0_wp)/(log(exp(z)))
    else
      phi = (exp(z)-1.0_wp)/z
    endif

  end function phi

 end subroutine erb2lej_init
 
!-----------------------------------------------------------------------

 !> Matrix exponential with interpolation at the real Leja points.
 subroutine erb2lej_step(unew,ti,ode,t,unow,ods,step_diag)
  class(t_erb2lej), intent(inout) :: ti
  class(c_ode),  intent(in)    :: ode
  real(wp),      intent(in)    :: t
  class(c_stv),  intent(in)    :: unow
  class(c_ods),  intent(inout) :: ods
  class(c_stv),  intent(inout) :: unew
  type(t_ti_step_diag), optional, intent(inout) :: step_diag

  ! fixed layout of ti%work: entries 1:2 are the interpolation
  ! workspace, entry 3 holds f(u^n), entry 4 is generic scratch
  integer, parameter :: &
    i_rhs = 3, & ! index in work of f(u^n)
    i_tmp = 4    ! index in work of the temporary storage

   ! evaluate the tendency f(t,u^n) into work(i_rhs)
   call ode%rhs(ti%work(i_rhs),t,unow,ods)

   ! delegate the computation of dt*phi(dt*A)*f(u^n); the same field
   ! plays the role of both fx0 and vvv (both are intent(in))
   call erb2lej_step_internals(unew,ods , ti%work(:2),ti%data , &
             ode,t , unow,ti%work(i_rhs),ti%work(i_rhs) , ti%dt , &
             ti%work(i_tmp) , step_diag)

   ! complete the update: u^{n+1} = u^n + dt*phi(dt*A)*f(u^n)
   call unew%incr(unow)

 end subroutine erb2lej_step
 
!-----------------------------------------------------------------------

 !> Provide the actual implementation for \c erb2lej_step
 !!
 !! All the comments in \c erb2kry_step_internals also apply to this
 !! subroutine.
 !!
 !! \note The name \c kry for the dummy argument is chosen for
 !! compatibility with \c erb2kry_step_internals, even though it simply
 !! means a working array for the construction of the interpolation
 !! polynomial. It should have (at least) two elements.
 subroutine erb2lej_step_internals(www,ods , kry,data , ode,t , &
                        x0,fx0,vvv , dt , tmp , step_diag , term)
  class(c_stv), intent(inout) :: kry(:) !< construct the interpolant
  class(c_expo_data), intent(inout) :: data
  class(c_ode), intent(in)    :: ode
  real(wp),     intent(in)    :: t
  class(c_stv), intent(in)    :: x0, fx0, vvv
  real(wp),     intent(in)    :: dt
  class(c_stv), intent(inout) :: tmp !< work variable for directional
  class(c_stv), intent(inout) :: www !< output (overwritten)
  class(c_ods), intent(inout) :: ods !< scratch storage
  type(t_ti_step_diag), optional, intent(inout) :: step_diag
  integer,      optional, intent(in) :: term(2)

  integer, parameter :: &
    ncheck = 4   ! convergence test
                 ! (require ncheck consecutive small Newton
                 ! increments before stopping)
  integer :: k, wa,wb,wc, ierr
  real(wp) :: n2w(ncheck), n2w0, nx0, nvvv, cgxi
  real(wp), allocatable :: errnorms(:)
  logical :: master, stop_points
  character(len=*), parameter :: &
    this_sub_name = 'erb2lej_step_internals'

   ! Implemented only for t_expolej_data; any other dynamic type of
   ! data silently skips the whole computation.
   select type(data); type is(t_expolej_data)

   master = data%mpi_id.eq.0
   n2w = huge(0.0_wp) ! make sure we make at least ncheck iterations

   ! Residual history is collected on the master rank only
   if(master.and.present(step_diag)) allocate(errnorms(data%nmax+1))

   !-------------------------------------------------------------
   ! Compute   w = phi(dt A) v
   !-------------------------------------------------------------

   ! norm of x0, used for the discrete Jacobian
   nx0 = sqrt( x0%scal( x0 ))

   ! first term in the Newton polynomial
   call www%mlt( data%dd(0) , vvv )

   ! Scaling coefficients for convergence test and Jacobian matrix
   nvvv = sqrt( vvv%scal( vvv ))
   n2w0 = abs(data%dd(0))*nvvv ! same as ||www||
   if(master.and.present(step_diag)) errnorms(1) = n2w0

   ! Note: in the following, work(wa) is initialized with f(u^n) and
   ! at each iteration it is equal to (A-z_{i-2})...(A-z_0)f(u^n) .

   ! subsequent terms in the Newton polynomial
   wa = 1; wb = 2
   call kry(wa)%copy( vvv )
   points_do: do k=1,data%nmax

     ! Compute the new matrix-vector product: wb = A*wa - xik*wa,
     ! where A = (dt*J - cc*I)/gg is the Jacobian rescaled onto the
     ! reference interval of the Leja points
     call directional( kry(wb) , ode,t,x0,fx0,kry(wa),ods , &
                       tmp , normx0=nx0,normv=nvvv , term=term )
     call kry(wb)%tims( dt/data%gg ) ! include dt/gamma
     cgxi = data%cc/data%gg + data%xi(k-1)
     call kry(wb)%inlt( -cgxi , kry(wa) )

     ! Update the Newton polynomial
     call www%inlt( data%dd(k) , kry(wb) )

     ! Scaling coefficients for convergence test and Jacobian matrix
     nvvv = sqrt( kry(wb)%scal( kry(wb) ) )
     n2w(2:ncheck) = n2w(1:ncheck-1) ! previous iterations
     n2w(1) = abs(data%dd(k))*nvvv

     ! Update the norm of the computed value  phi(dt*A)*v
     n2w0 = sqrt( www%scal( www ) )
     ! The stopping decision is taken on the master rank (relative
     ! test on the last ncheck increments) and then broadcast
     if(master) then
       stop_points = all( n2w .le. data%leja_tol*n2w0 )
       if(present(step_diag)) errnorms(k+1) = n2w(1)
     endif
     call mpi_bcast(stop_points,1,mpi_logical,0,data%mpi_comm,ierr)
     if(stop_points) exit points_do

     ! Swap indexes in the work arrays
     wc = wb; wb = wa; wa = wc

   enddo points_do

   ! Diagnostics: only the master rank stores the residual history;
   ! the other ranks flag the missing data with iterations = -1
   if(present(step_diag)) then
     if(master) then
       step_diag%max_iter = .not.stop_points
       if(step_diag%max_iter) k = k-1 ! k incremented on exit
       step_diag%iterations = k
       if(allocated(step_diag%d1)) deallocate(step_diag%d1)
       allocate(step_diag%d1(0:k)); step_diag%d1 = errnorms(:k+1)
     else
       step_diag%iterations = -1
     endif
   endif
   if(allocated(errnorms)) deallocate(errnorms)

   !-------------------------------------------------------------
   ! Include the time step:   w = dt * phi(dt A) v
   !-------------------------------------------------------------

   call www%tims(dt)

   end select

 end subroutine erb2lej_step_internals
 
!-----------------------------------------------------------------------

 !> Cleanup for the Leja exponential integrator: release the work
 !! vectors, the integrator data and, if requested, the diagnostics.
 pure subroutine erb2lej_clean(ti,ode,step_diag)
  class(c_ode),  intent(in)    :: ode
  class(t_erb2lej), intent(inout) :: ti
  type(t_ti_step_diag), optional, intent(inout) :: step_diag

  character(len=*), parameter :: &
    this_sub_name = 'erb2lej_clean'

  ! storage owned by the time integrator itself
  deallocate(ti%work)
  deallocate(ti%data)

  ! diagnostic arrays: each one is released only if allocated
  if(present(step_diag)) then
    if(allocated(step_diag%d1)) deallocate(step_diag%d1)
    if(allocated(step_diag%d2)) deallocate(step_diag%d2)
  endif

 end subroutine erb2lej_clean

!-----------------------------------------------------------------------

 !> Computation of the full matrix exponential
 !!
 !! Compute the matrix exponential of a full matrix by scaling and
 !! squaring and Pad&eacute; approximation. Notice that it is not
 !! required that the matrix be in Hessenberg form; notice as well
 !! that the algorithm is well defined also for singular matrices. As
 !! described in detail in <a
 !! href="http://www.jstor.org/stable/25054364">[Moler, Van Loan, SIAM
 !! Review, 2003]</a>, <a
 !! href="http://dx.doi.org/10.1145/285861.285868">[Sidje, ACM
 !! Transactions on Mathematical Software, 1998]</a> and <a
 !! href="http://dx.doi.org/10.1137/04061101X">[Higham, SIAM J. Matrix
 !! Anal. Appl., 2005]</a>, the \f$(p,q)\f$ Pad&eacute; approximant of
 !! the exponential \f$\exp{(x)}\f$ is defined as the only rational
 !! function \f$P_{p,q}(x)=N_{p,q}(x)/D_{p,q}(x)\f$ such that for
 !! small \f$x\f$
 !! \f{displaymath}{
 !!  \exp{(x)} - P_{p,q}(x) = \mathcal{O}(x^{p+q+1}).
 !! \f}
 !! For practical computations, there are some reasons in favour of
 !! choosing \f$p=q\f$. Furthermore, for the exponential function it
 !! can be proven that \f$D_{p,p}(x)=N_{p,p}(-x)\f$ and that
 !! \f$N_{p,p}(x)=\sum_{k=0}^pc_kx^k\f$ with \f$c_0=1\f$ and
 !! \f$c_k=c_{k-1}(p+1-k)/(2pk +k -k^2)\f$, for \f$k=1,\dots,p\f$.
 !! For the present implementation, we follow <a
 !! href="http://dx.doi.org/10.1137/04061101X">[Higham, SIAM J. Matrix
 !! Anal. Appl., 2005]</a> and <a
 !! href="http://eigen.tuxfamily.org/index.php?title=Main_Page">[Niesen,
 !! <em>Eigen</em>, a lightweight C++ template library for linear
 !! algebra]</a> for the optimal choice of the degree \f$p\f$ and
 !! further optimizations of the computational cost. The main
 !! difficulty with the Pad&eacute; approximation technique is that it
 !! only yields an accurate approximation for matrices \f${\bf A}\f$
 !! whose operator norm \f$\| {\bf A} \|= \sup_{\|{\bf x}\|=1} \|{\bf
 !! Ax} \|\f$ is small. For this reason, this technique is combined
 !! with a <em>scaling and squaring</em> algorithm. This amounts to
 !! exploiting the natural property 
 !! \f{displaymath}{
 !!  \exp{({\bf A})} = \exp{(2^{-s}{\bf A})}^{2^s}
 !! \f}
 !! of the exponential function, where \f$s\f$ is a positive integer.
 !! Thus, the combination of Pad&eacute; approximation and scaling and
 !! squaring consists in determining
 !! \f{displaymath}{
 !!  \left(P_{p,p}(2^{-s}{\bf A})\right)^{2^s} \approx \exp{({\bf A})}.
 !! \f}
 !! The optimal algorithm thus has to leverage on the two parameters
 !! \f$s\f$ and \f$p\f$ to achieve a machine accurate approximation of
 !! the exponential, while reducing the computational cost as well as
 !! the round-off errors. In this implementation, we closely follow
 !! <a
 !! href="http://dx.doi.org/10.1137/04061101X">[Higham, SIAM J. Matrix
 !! Anal. Appl., 2005]</a> and <tt>MatrixExponential.h</tt> in <a
 !! href="http://eigen.tuxfamily.org/index.php?title=Main_Page">[Niesen,
 !! <em>Eigen</em>, a lightweight C++ template library for linear
 !! algebra]</a>. The resulting algorithm works as follows:
 !! <ul>
 !!  <li> compute \f$\|A\|_1\f$ (indeed, any matrix norm could be used)
 !!  <li> if the matrix norm is lower than a precomputed constant, no
 !!  scaling is introduced and the required accuracy is obtained by
 !!  choosing \f$p\f$
 !!  <li> if the matrix norm is large, then the required \f$p\f$ would
 !!  be too large in absence of scaling and squaring, so \f$p\f$ is
 !!  set to the maximum available value and the desired tolerance is
 !!  obtained selecting \f$s>0\f$
 !!  <li> the algorithm then computes two matrices \f$U\f$ and \f$V\f$
 !!  corresponding to the odd and even terms in the Pad&eacute;
 !!  expansion, respectively
 !!  <li> the two matrices \f$U\f$ and \f$V\f$ are combined to compute
 !!  the exponential.
 !! </ul>
 pure subroutine expa(aa,ex,eye)
  real(wp), intent(in) :: aa(:,:)  !< we compute \f$exp(A)\f$
  real(wp), intent(in) :: eye(:,:) !< identity matrix
  real(wp), intent(out) :: ex(:,:) !< matrix exponential

  ! Degree-switching thresholds on ||A||_1: below threshold(i) the
  ! i-th Pade degree already gives machine accuracy without scaling
  ! (values from [Higham 2005] / Eigen's MatrixExponential.h)
  real(wp), parameter :: &
    threshold_double(4) = (/ &
      1.495585217958292e-002_wp , &
      2.539398330063230e-001_wp , &
      9.504178996162932e-001_wp , &
      2.097847961257068e+000_wp /), &
    threshold_quad(5)   = (/ &
      1.639394610288918690547467954466970e-005_wp , &
      4.253237712165275566025884344433009e-003_wp , &
      5.125804063165764409885122032933142e-002_wp , &
      2.170000765161155195453205651889853e-001_wp , &
      1.125358383453143065081397882891878e+000_wp /), &
    maxnorm_double = 5.371920351148152_wp, &
    maxnorm_quad   = 2.884233277829519311757165057717815_wp

  integer :: ns, k
  real(wp) :: hnorm
  real(wp), allocatable :: uu(:,:), vv(:,:), tmp1(:,:), tmp2(:,:)

   k = size(aa,1)
   allocate( uu(k,k) , vv(k,k) , tmp1(k,k) , tmp2(k,k) )

   !-------------------------------------------------------------
   ! matrix norm
   !-------------------------------------------------------------
   hnorm = maxval( sum(abs(aa),2) ) ! 1-norm

   !-------------------------------------------------------------
   ! compute U and V (and choose ns)
   !-------------------------------------------------------------
   ns = 0
   if(wp_p.le.16) then ! double precision
     if(hnorm.lt.threshold_double(1)) then
       call pade3(uu,vv,aa,eye,tmp1,tmp2)
     elseif(hnorm.lt.threshold_double(2)) then
       call pade5(uu,vv,aa,eye,tmp1,tmp2)
     elseif(hnorm.lt.threshold_double(3)) then
       call pade7(uu,vv,aa,eye,tmp1,tmp2)
     elseif(hnorm.lt.threshold_double(4)) then
       call pade9(uu,vv,aa,eye,tmp1,tmp2)
     else
       ! scaling and squaring: smallest ns with ||2^-ns A||<=maxnorm
       ns = max( 0 , ceiling(log(hnorm/maxnorm_double)/log(2.0_wp)) )
       ! scale in floating point: 0.5_wp**ns is exact (a power of
       ! two) and, unlike the integer expression 2**ns, cannot
       ! overflow for large ns
       call pade13(uu,vv,aa*(0.5_wp**ns),eye,tmp1,tmp2)
     endif
   else ! quad precision
     if(hnorm.lt.threshold_quad(1)) then
       call pade3(uu,vv,aa,eye,tmp1,tmp2)
     elseif(hnorm.lt.threshold_quad(2)) then
       call pade5(uu,vv,aa,eye,tmp1,tmp2)
     elseif(hnorm.lt.threshold_quad(3)) then
       call pade7(uu,vv,aa,eye,tmp1,tmp2)
     elseif(hnorm.lt.threshold_quad(4)) then
       call pade9(uu,vv,aa,eye,tmp1,tmp2)
     elseif(hnorm.lt.threshold_quad(5)) then
       call pade13(uu,vv,aa,eye,tmp1,tmp2)
     else
       ns = max( 0 , ceiling(log(hnorm/maxnorm_quad)/log(2.0_wp)) )
       ! floating point scaling, see the comment above
       call pade17(uu,vv,aa*(0.5_wp**ns),eye,tmp1,tmp2)
     endif
   endif

   !-------------------------------------------------------------
   ! compute the exponential matrix
   !-------------------------------------------------------------
   tmp1 =  uu + vv ! numerator of Pade approximant
   tmp2 = -uu + vv ! denominator of Pade approximant
   call invmat(tmp2,ex)
   ex = matmul(ex,tmp1)

   !-------------------------------------------------------------
   !  squaring to get final approximation of exponential
   !-------------------------------------------------------------
   ! note: k is reused as loop counter here, its previous value (the
   ! matrix size) is no longer needed
   do k=1,ns
     ex = matmul(ex,ex)
   enddo

   deallocate( uu , vv , tmp1 , tmp2 )
  contains

   ! Note: the coefficients are normalized so that they are integer
   ! values - the largest coefficients however would require
   ! integer*16 variables, which are available in gfortran but not in
   ! ifort (as for version 13). For this reason, we treat the
   ! coefficients as real(wp) literal constants.
   !
   ! In all the padeN routines, U collects the odd powers of A and V
   ! the even ones, so that N(A)=U+V and D(A)=-U+V.

   pure subroutine pade3(uu,vv,aa,eye,tmp1,tmp2)
    real(wp), intent(out) :: uu(:,:), vv(:,:), tmp1(:,:), tmp2(:,:)
    real(wp), intent(in) :: aa(:,:), eye(:,:)

    real(wp), parameter :: &
      c(0:3) = (/ 120.0_wp , 60.0_wp , 12.0_wp , 1.0_wp /)

     tmp1 = matmul(aa,aa)
     tmp2 = c(3)*tmp1 + c(1)*eye
     uu = matmul(aa,tmp2)
     vv = c(2)*tmp1 + c(0)*eye
   end subroutine pade3

   pure subroutine pade5(uu,vv,aa,eye,tmp1,tmp2)
    real(wp), intent(out) :: uu(:,:), vv(:,:), tmp1(:,:), tmp2(:,:)
    real(wp), intent(in) :: aa(:,:), eye(:,:)

    real(wp), parameter :: &
      c(0:5) = (/ &
        30240.0_wp , &
        15120.0_wp , &
         3360.0_wp , &
          420.0_wp , &
           30.0_wp , &
            1.0_wp /)
    real(wp), allocatable :: a2(:,:)

     allocate(a2(size(aa,1),size(aa,2)))
     a2 = matmul(aa,aa)
     tmp1 = matmul(a2,a2)
     tmp2 = c(5)*tmp1 + c(3)*a2 + c(1)*eye
     uu = matmul(aa,tmp2)
     vv = c(4)*tmp1 + c(2)*a2 + c(0)*eye
     deallocate(a2)

   end subroutine pade5

   pure subroutine pade7(uu,vv,aa,eye,tmp1,tmp2)
    real(wp), intent(out) :: uu(:,:), vv(:,:), tmp1(:,:), tmp2(:,:)
    real(wp), intent(in) :: aa(:,:), eye(:,:)

    real(wp), parameter :: &
      c(0:7) = (/ &
        17297280.0_wp , &
         8648640.0_wp , &
         1995840.0_wp , &
          277200.0_wp , &
           25200.0_wp , &
            1512.0_wp , &
              56.0_wp , &
               1.0_wp /)
    real(wp), allocatable :: a2(:,:), a4(:,:)

     allocate(a2(size(aa,1),size(aa,2)),a4(size(aa,1),size(aa,2)))
     a2 = matmul(aa,aa)
     a4 = matmul(a2,a2)
     tmp1 = matmul(a4,a2)
     tmp2 = c(7)*tmp1 + c(5)*a4 + c(3)*a2 + c(1)*eye
     uu = matmul(aa,tmp2)
     vv = c(6)*tmp1 + c(4)*a4 + c(2)*a2 + c(0)*eye
     deallocate(a2,a4)

   end subroutine pade7

   pure subroutine pade9(uu,vv,aa,eye,tmp1,tmp2)
    real(wp), intent(out) :: uu(:,:), vv(:,:), tmp1(:,:), tmp2(:,:)
    real(wp), intent(in) :: aa(:,:), eye(:,:)

    real(wp), parameter :: &
      c(0:9) = (/ &
        17643225600.0_wp , &
         8821612800.0_wp , &
         2075673600.0_wp , &
          302702400.0_wp , &
           30270240.0_wp , &
            2162160.0_wp , &
             110880.0_wp , &
               3960.0_wp , &
                 90.0_wp , &
                  1.0_wp /)
    real(wp), allocatable :: a2(:,:), a4(:,:), a6(:,:)

     allocate(a2(size(aa,1),size(aa,2)),a4(size(aa,1),size(aa,2)), &
              a6(size(aa,1),size(aa,2)))
     a2 = matmul(aa,aa)
     a4 = matmul(a2,a2)
     a6 = matmul(a4,a2)
     tmp1 = matmul(a4,a4)
     tmp2 = c(9)*tmp1 + c(7)*a6 + c(5)*a4 + c(3)*a2 + c(1)*eye
     uu = matmul(aa,tmp2)
     vv = c(8)*tmp1 + c(6)*a6 + c(4)*a4 + c(2)*a2 + c(0)*eye
     deallocate(a2,a4,a6)

   end subroutine pade9

   pure subroutine pade13(uu,vv,aa,eye,tmp1,tmp2)
    real(wp), intent(out) :: uu(:,:), vv(:,:), tmp1(:,:), tmp2(:,:)
    real(wp), intent(in) :: aa(:,:), eye(:,:)

    real(wp), parameter :: &
      c(0:13) = (/ &
        64764752532480000.0_wp , &
        32382376266240000.0_wp , &
         7771770303897600.0_wp , &
         1187353796428800.0_wp , &
          129060195264000.0_wp , &
           10559470521600.0_wp , &
             670442572800.0_wp , &
              33522128640.0_wp , &
               1323241920.0_wp , &
                 40840800.0_wp , &
                   960960.0_wp , &
                    16380.0_wp , &
                      182.0_wp , &
                        1.0_wp /)
    real(wp), allocatable :: a2(:,:), a4(:,:)

     allocate(a2(size(aa,1),size(aa,2)),a4(size(aa,1),size(aa,2)))
     a2 = matmul(aa,aa)
     a4 = matmul(a2,a2)
     tmp1 = matmul(a4,a2)
     ! Higham's factorization: the highest powers are obtained with a
     ! single extra product by A^6 (stored in tmp1)
     vv = c(13)*tmp1 + c(11)*a4 + c(9)*a2 ! used for temporary storage
     tmp2 = matmul(tmp1,vv) + c(7)*tmp1 + c(5)*a4 + c(3)*a2 + c(1)*eye
     uu = matmul(aa,tmp2)
     tmp2 = c(12)*tmp1 + c(10)*a4 + c(8)*a2
     vv = matmul(tmp1,tmp2) + c(6)*tmp1 + c(4)*a4 + c(2)*a2 + c(0)*eye
     deallocate(a2,a4)

   end subroutine pade13

   pure subroutine pade17(uu,vv,aa,eye,tmp1,tmp2)
    real(wp), intent(out) :: uu(:,:), vv(:,:), tmp1(:,:), tmp2(:,:)
    real(wp), intent(in) :: aa(:,:), eye(:,:)

    real(wp), parameter :: &
      c(0:17) = (/ &
        830034394580628357120000.0_wp , &
        415017197290314178560000.0_wp , &
        100610229646136770560000.0_wp , &
         15720348382208870400000.0_wp , &
          1774878043152614400000.0_wp , &
           153822763739893248000.0_wp , &
            10608466464820224000.0_wp , &
              595373117923584000.0_wp , &
               27563570274240000.0_wp , &
                1060137318240000.0_wp , &
                  33924394183680.0_wp , &
                    899510451840.0_wp , &
                     19554575040.0_wp , &
                       341863200.0_wp , &
                         4651200.0_wp , &
                           46512.0_wp , &
                             306.0_wp , &
                               1.0_wp /)
    real(wp), allocatable :: a2(:,:), a4(:,:), a6(:,:)

     allocate(a2(size(aa,1),size(aa,2)),a4(size(aa,1),size(aa,2)), &
              a6(size(aa,1),size(aa,2)))
     a2 = matmul(aa,aa)
     a4 = matmul(a2,a2)
     a6 = matmul(a4,a2)
     tmp1 = matmul(a4,a4)
     ! same factorization trick as in pade13, with A^8 in tmp1
     vv = c(17)*tmp1 + c(15)*a6 + c(13)*a4 + c(11)*a2 ! temp. storage
     tmp2 = matmul(tmp1,vv) + c(9)*tmp1 + c(7)*a6 + c(5)*a4 + c(3)*a2 &
           + c(1)*eye
     uu = matmul(aa,tmp2)
     tmp2 = c(16)*tmp1 + c(14)*a6 + c(12)*a4 + c(10)*a2
     vv = matmul(tmp1,tmp2) + c(8)*tmp1 + c(6)*a6 + c(4)*a4 + c(2)*a2 &
           + c(0)*eye
     deallocate(a2,a4,a6)

   end subroutine pade17

 end subroutine expa

!-----------------------------------------------------------------------
 
 !> Computation of \f$J_{{\bf f}}{\bf v}\f$ for a generic vector
 !! \f${\bf v}\f$ 
 !!
 !! Given the vector function \f${\bf f}(t,{\bf x})\f$ and the vector
 !! \f$\bf v\f$, this subroutine computes the finite difference
 !! numerical approximation of
 !! \f{displaymath}{
 !!  \frac{\partial \bf f}{\partial \bf x}(t_0,{\bf x}_0) {\bf v}
 !! \f}
 !! defined by
 !! \f{displaymath}{
 !!  \frac{{\bf f}(t_0,{\bf x}_0 +\epsilon{\bf v})-{\bf f}(t_0,{\bf
 !!  x}_0)}{\epsilon}.
 !! \f}
 !! \note An input argument \c fx0 is included since typically the
 !! quantity \f${\bf f}(t_0,{\bf x}_0)\f$ can be reused among many
 !! calls to this function.
 !!
 !! Two input arguments are included to tune the parameter
 !! \f$\epsilon\f$ according to the norm of \f${\bf x}_0\f$ and \f${\bf
 !! v}\f$. If they are present, the discrete derivative is computed as
 !! \f{displaymath}{
 !!  \frac{{\bf f}(t_0,{\bf x}_0 +\tilde{\epsilon}{\bf v})-{\bf
 !!  f}(t_0,{\bf x}_0)}{\tilde{\epsilon}}.
 !! \f}
 !! with
 !! \f{displaymath}{
 !!  \tilde{\epsilon} = \frac{\|{\bf x}_0\|}{\|{\bf v}\|}\epsilon.
 !! \f}
 !! The reason why such parameters are not computed inside this
 !! subroutine is that they could result in a significant overhead,
 !! often not required.
 subroutine directional(jv,ode,t,x0,fx0,v,ods,tmp,normx0,normv,term)
  !> <tt>ode%rhs</tt> is the function \f${\bf f}\f$ to be
  !! differentiated
  class(c_ode), intent(in)    :: ode
  !> present time level \f$t_0\f$
  real(wp),     intent(in)    :: t
  !> present state \f${\bf x}_0\f$
  class(c_stv), intent(in)    :: x0
  !> present value \f${\bf f}(t_0,{\bf x}_0)\f$
  class(c_stv), intent(in)    :: fx0
  !> direction vector \f${\bf v}\f$
  class(c_stv), intent(in)    :: v
  !> directional derivative \f$J_{{\bf f}}{\bf v}\f$
  class(c_stv), intent(inout) :: jv
  !> scratch storage
  class(c_ods), intent(inout) :: ods
  !> work variable, used to avoid many allocations/deallocations
  class(c_stv), intent(inout) :: tmp
  real(wp), optional, intent(in) :: normx0 !< \f$\|{\bf x}_0\|\f$
  real(wp), optional, intent(in) :: normv  !< \f$\|{\bf v}\|\f$
  integer,  optional, intent(in) :: term(2) !< see \c i_rhs

  real(wp), parameter :: epsi = 1.0e-6_wp
  real(wp) :: eps_fd

  character(len=*), parameter :: &
    this_sub_name = 'directional'
  
   ! finite difference increment: rescaled by ||x0||/||v|| when the
   ! norms are given (normv is assumed present whenever normx0 is)
   eps_fd = epsi
   if(present(normx0)) eps_fd = normx0/normv * epsi

   ! perturbed state: tmp = x0 + eps*v
   call tmp%alt(x0,eps_fd,v)
   ! evaluate the rhs at the perturbed state: jv = f( x0 + eps*v )
   call ode%rhs(jv,t,tmp,ods , term)
   ! finite difference: jv = ( f(x0+eps*v) - f(x0) )/eps
   call jv%inlt(-1.0_wp,fx0)
   call jv%tims(1.0_wp/eps_fd)

 end subroutine directional

!-----------------------------------------------------------------------

end module mod_expo

